{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-04-28T06:43:10.313758800Z",
     "start_time": "2024-04-28T06:41:57.970289500Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/masaikk/miniconda3/envs/11m10/lib/python3.10/site-packages/torch/cuda/__init__.py:628: UserWarning: Can't initialize NVML\n",
      "  warnings.warn(\"Can't initialize NVML\")\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==((====))==  Unsloth: Fast Llama patching release 2024.4\n",
      "   \\\\   /|    GPU: NVIDIA GeForce RTX 2080 Ti. Max memory: 11.0 GB. Platform = Linux.\n",
      "O^O/ \\_/ \\    Pytorch: 2.2.2. CUDA = 7.5. CUDA Toolkit = 12.1.\n",
      "\\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.\n",
      " \"-____-\"     Free Apache license: http://github.com/unslothai/unsloth\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "from unsloth import FastLanguageModel\n",
    "import torch\n",
    "max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n",
    "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",
    "load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n",
    "\n",
    "# 4bit pre quantized models we support for 4x faster downloading + no OOMs.\n",
    "fourbit_models = [\n",
    "    \"unsloth/mistral-7b-bnb-4bit\",\n",
    "    \"unsloth/mistral-7b-instruct-v0.2-bnb-4bit\",\n",
    "    \"unsloth/llama-2-7b-bnb-4bit\",\n",
    "    \"unsloth/gemma-7b-bnb-4bit\",\n",
    "    \"unsloth/gemma-7b-it-bnb-4bit\", # Instruct version of Gemma 7b\n",
    "    \"unsloth/gemma-2b-bnb-4bit\",\n",
    "    \"unsloth/gemma-2b-it-bnb-4bit\", # Instruct version of Gemma 2b\n",
    "    \"unsloth/llama-3-8b-bnb-4bit\", # [NEW] 15 Trillion token Llama-3\n",
    "] # More models at https://huggingface.co/unsloth\n",
    "\n",
    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name = \"unsloth/llama-3-8b-bnb-4bit\",\n",
    "    max_seq_length = max_seq_length,\n",
    "    dtype = dtype,\n",
    "    load_in_4bit = load_in_4bit,\n",
    "    # token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.\n"
     ]
    }
   ],
   "source": [
    "model = FastLanguageModel.get_peft_model(\n",
    "    model,\n",
    "    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n",
    "    target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
    "                      \"gate_proj\", \"up_proj\", \"down_proj\",],\n",
    "    lora_alpha = 16,\n",
    "    lora_dropout = 0, # Supports any, but = 0 is optimized\n",
    "    bias = \"none\",    # Supports any, but = \"none\" is optimized\n",
    "    # [NEW] \"unsloth\" uses 30% less VRAM, fits 2x larger batch sizes!\n",
    "    use_gradient_checkpointing = \"unsloth\", # True or \"unsloth\" for very long context\n",
    "    random_state = 3407,\n",
    "    use_rslora = False,  # We support rank stabilized LoRA\n",
    "    loftq_config = None, # And LoftQ\n",
    ")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-28T06:43:47.572813300Z",
     "start_time": "2024-04-28T06:43:46.653664400Z"
    }
   },
   "id": "fb587e55cc7b7599",
   "execution_count": 2
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "Generating train split: 0 examples [00:00, ? examples/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "ebf50c322f8f4085bf9e540fb6f66fb1"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "Map:   0%|          | 0/51760 [00:00<?, ? examples/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "9a73288a851c4c1a9067ccdc5752a374"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "alpaca_prompt = \"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
    "\n",
    "### Instruction:\n",
    "{}\n",
    "\n",
    "### Input:\n",
    "{}\n",
    "\n",
    "### Response:\n",
    "{}\"\"\"\n",
    "\n",
    "EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN\n",
    "def formatting_prompts_func(examples):\n",
    "    instructions = examples[\"instruction\"]\n",
    "    inputs       = examples[\"input\"]\n",
    "    outputs      = examples[\"output\"]\n",
    "    texts = []\n",
    "    for instruction, input, output in zip(instructions, inputs, outputs):\n",
    "        # Must add EOS_TOKEN, otherwise your generation will go on forever!\n",
    "        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN\n",
    "        texts.append(text)\n",
    "    return { \"text\" : texts, }\n",
    "pass\n",
    "\n",
    "from datasets import load_dataset\n",
    "dataset = load_dataset(r\"/mnt/j/coding/llm/cmbai/pretrain/data/alpaca\", split = \"train\")\n",
    "dataset = dataset.map(formatting_prompts_func, batched = True,)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-28T07:05:10.300535200Z",
     "start_time": "2024-04-28T07:05:09.428006200Z"
    }
   },
   "id": "21361f47df6bf18f",
   "execution_count": 5
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "739c173653804c5baba490b157fa6400"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "max_steps is given, it will override any value given in num_train_epochs\n"
     ]
    }
   ],
   "source": [
    "from trl import SFTTrainer\n",
    "from transformers import TrainingArguments\n",
    "\n",
    "trainer = SFTTrainer(\n",
    "    model = model,\n",
    "    tokenizer = tokenizer,\n",
    "    train_dataset = dataset,\n",
    "    dataset_text_field = \"text\",\n",
    "    max_seq_length = max_seq_length,\n",
    "    dataset_num_proc = 2,\n",
    "    packing = False, # Can make training 5x faster for short sequences.\n",
    "    args = TrainingArguments(\n",
    "        per_device_train_batch_size = 2,\n",
    "        gradient_accumulation_steps = 4,\n",
    "        warmup_steps = 5,\n",
    "        max_steps = 60,\n",
    "        learning_rate = 2e-4,\n",
    "        fp16 = not torch.cuda.is_bf16_supported(),\n",
    "        bf16 = torch.cuda.is_bf16_supported(),\n",
    "        logging_steps = 1,\n",
    "        optim = \"adamw_8bit\",\n",
    "        weight_decay = 0.01,\n",
    "        lr_scheduler_type = \"linear\",\n",
    "        seed = 3407,\n",
    "        output_dir = \"outputs\",\n",
    "    ),\n",
    ")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-28T07:06:01.482056800Z",
     "start_time": "2024-04-28T07:05:52.968185100Z"
    }
   },
   "id": "22b0f73c60611dd1",
   "execution_count": 6
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "\n    <div>\n      \n      <progress value='2' max='60' style='width:300px; height:20px; vertical-align: middle;'></progress>\n      [ 2/60 : < :, Epoch 0.00/1]\n    </div>\n    <table border=\"1\" class=\"dataframe\">\n  <thead>\n <tr style=\"text-align: left;\">\n      <th>Step</th>\n      <th>Training Loss</th>\n    </tr>\n  </thead>\n  <tbody>\n  </tbody>\n</table><p>"
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[0;31mKeyboardInterrupt\u001B[0m                         Traceback (most recent call last)",
      "Cell \u001B[0;32mIn[9], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m trainer_stats \u001B[38;5;241m=\u001B[39m \u001B[43mtrainer\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtrain\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[0;32m~/miniconda3/envs/11m10/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:361\u001B[0m, in \u001B[0;36mSFTTrainer.train\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m    358\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mneftune_noise_alpha \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_trainer_supports_neftune:\n\u001B[1;32m    359\u001B[0m     \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmodel \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_trl_activate_neftune(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmodel)\n\u001B[0;32m--> 361\u001B[0m output \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtrain\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m    363\u001B[0m \u001B[38;5;66;03m# After training we make sure to retrieve back the original forward pass method\u001B[39;00m\n\u001B[1;32m    364\u001B[0m \u001B[38;5;66;03m# for the embedding layer by removing the forward post hook.\u001B[39;00m\n\u001B[1;32m    365\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mneftune_noise_alpha \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_trainer_supports_neftune:\n",
      "File \u001B[0;32m~/miniconda3/envs/11m10/lib/python3.10/site-packages/transformers/trainer.py:1859\u001B[0m, in \u001B[0;36mTrainer.train\u001B[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001B[0m\n\u001B[1;32m   1857\u001B[0m         hf_hub_utils\u001B[38;5;241m.\u001B[39menable_progress_bars()\n\u001B[1;32m   1858\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m-> 1859\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43minner_training_loop\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m   1860\u001B[0m \u001B[43m        \u001B[49m\u001B[43margs\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1861\u001B[0m \u001B[43m        \u001B[49m\u001B[43mresume_from_checkpoint\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mresume_from_checkpoint\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1862\u001B[0m \u001B[43m        \u001B[49m\u001B[43mtrial\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtrial\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1863\u001B[0m \u001B[43m        \u001B[49m\u001B[43mignore_keys_for_eval\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mignore_keys_for_eval\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1864\u001B[0m \u001B[43m    \u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[0;32m<string>:361\u001B[0m, in \u001B[0;36m_fast_inner_training_loop\u001B[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001B[0m\n",
      "File \u001B[0;32m~/miniconda3/envs/11m10/lib/python3.10/site-packages/transformers/trainer.py:3147\u001B[0m, in \u001B[0;36mTrainer.training_step\u001B[0;34m(self, model, inputs)\u001B[0m\n\u001B[1;32m   3145\u001B[0m         scaled_loss\u001B[38;5;241m.\u001B[39mbackward()\n\u001B[1;32m   3146\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m-> 3147\u001B[0m     \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43maccelerator\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbackward\u001B[49m\u001B[43m(\u001B[49m\u001B[43mloss\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   3149\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m loss\u001B[38;5;241m.\u001B[39mdetach() \u001B[38;5;241m/\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39margs\u001B[38;5;241m.\u001B[39mgradient_accumulation_steps\n",
      "File \u001B[0;32m~/miniconda3/envs/11m10/lib/python3.10/site-packages/accelerate/accelerator.py:2011\u001B[0m, in \u001B[0;36mAccelerator.backward\u001B[0;34m(self, loss, **kwargs)\u001B[0m\n\u001B[1;32m   2009\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m\n\u001B[1;32m   2010\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mscaler \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m-> 2011\u001B[0m     \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mscaler\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mscale\u001B[49m\u001B[43m(\u001B[49m\u001B[43mloss\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbackward\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   2012\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m   2013\u001B[0m     loss\u001B[38;5;241m.\u001B[39mbackward(\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n",
      "File \u001B[0;32m~/miniconda3/envs/11m10/lib/python3.10/site-packages/torch/_tensor.py:522\u001B[0m, in \u001B[0;36mTensor.backward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m    512\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m has_torch_function_unary(\u001B[38;5;28mself\u001B[39m):\n\u001B[1;32m    513\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m handle_torch_function(\n\u001B[1;32m    514\u001B[0m         Tensor\u001B[38;5;241m.\u001B[39mbackward,\n\u001B[1;32m    515\u001B[0m         (\u001B[38;5;28mself\u001B[39m,),\n\u001B[0;32m   (...)\u001B[0m\n\u001B[1;32m    520\u001B[0m         inputs\u001B[38;5;241m=\u001B[39minputs,\n\u001B[1;32m    521\u001B[0m     )\n\u001B[0;32m--> 522\u001B[0m \u001B[43mtorch\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mautograd\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbackward\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m    523\u001B[0m \u001B[43m    \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mgradient\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mretain_graph\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcreate_graph\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43minputs\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43minputs\u001B[49m\n\u001B[1;32m    524\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[0;32m~/miniconda3/envs/11m10/lib/python3.10/site-packages/torch/autograd/__init__.py:266\u001B[0m, in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m    261\u001B[0m     retain_graph \u001B[38;5;241m=\u001B[39m create_graph\n\u001B[1;32m    263\u001B[0m \u001B[38;5;66;03m# The reason we repeat the same comment below is that\u001B[39;00m\n\u001B[1;32m    264\u001B[0m \u001B[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001B[39;00m\n\u001B[1;32m    265\u001B[0m \u001B[38;5;66;03m# calls in the traceback and some print out the last line\u001B[39;00m\n\u001B[0;32m--> 266\u001B[0m \u001B[43mVariable\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_execution_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrun_backward\u001B[49m\u001B[43m(\u001B[49m\u001B[43m  \u001B[49m\u001B[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001B[39;49;00m\n\u001B[1;32m    267\u001B[0m \u001B[43m    \u001B[49m\u001B[43mtensors\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    268\u001B[0m \u001B[43m    \u001B[49m\u001B[43mgrad_tensors_\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    269\u001B[0m \u001B[43m    \u001B[49m\u001B[43mretain_graph\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    270\u001B[0m \u001B[43m    \u001B[49m\u001B[43mcreate_graph\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    271\u001B[0m \u001B[43m    \u001B[49m\u001B[43minputs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    272\u001B[0m \u001B[43m    \u001B[49m\u001B[43mallow_unreachable\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m,\u001B[49m\n\u001B[1;32m    273\u001B[0m \u001B[43m    \u001B[49m\u001B[43maccumulate_grad\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m,\u001B[49m\n\u001B[1;32m    274\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n",
      "\u001B[0;31mKeyboardInterrupt\u001B[0m: "
     ]
    }
   ],
   "source": [
    "trainer_stats = trainer.train()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-28T07:46:47.752762500Z",
     "start_time": "2024-04-28T07:45:04.839510700Z"
    }
   },
   "id": "761ff25233e50126",
   "execution_count": 9
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# alpaca_prompt = Copied from above\n",
    "FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n",
    "inputs = tokenizer(\n",
    "[\n",
    "    alpaca_prompt.format(\n",
    "        \"Continue the fibonnaci sequence.\", # instruction\n",
    "        \"1, 1, 2, 3, 5, 8\", # input\n",
    "        \"\", # output - leave this blank for generation!\n",
    "    )\n",
    "], return_tensors = \"pt\").to(\"cuda\")\n",
    "\n",
    "outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)\n",
    "tokenizer.batch_decode(outputs)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "fa1d2a5db1969ea7"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# alpaca_prompt = Copied from above\n",
    "FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n",
    "inputs = tokenizer(\n",
    "[\n",
    "    alpaca_prompt.format(\n",
    "        \"Continue the fibonnaci sequence.\", # instruction\n",
    "        \"1, 1, 2, 3, 5, 8\", # input\n",
    "        \"\", # output - leave this blank for generation!\n",
    "    )\n",
    "], return_tensors = \"pt\").to(\"cuda\")\n",
    "\n",
    "from transformers import TextStreamer\n",
    "text_streamer = TextStreamer(tokenizer)\n",
    "_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "6937d53f6a87412"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "model.save_pretrained(\"lora_model\") # Local saving\n",
    "tokenizer.save_pretrained(\"lora_model\")"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "954d6d285c9b38fb"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "if True:\n",
    "    from unsloth import FastLanguageModel\n",
    "    model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "        model_name = \"lora_model\", # YOUR MODEL YOU USED FOR TRAINING\n",
    "        max_seq_length = max_seq_length,\n",
    "        dtype = dtype,\n",
    "        load_in_4bit = load_in_4bit,\n",
    "    )\n",
    "    FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n",
    "\n",
    "# alpaca_prompt = You MUST copy from above!\n",
    "\n",
    "inputs = tokenizer(\n",
    "[\n",
    "    alpaca_prompt.format(\n",
    "        \"What is a famous tall tower in Paris?\", # instruction\n",
    "        \"\", # input\n",
    "        \"\", # output - leave this blank for generation!\n",
    "    )\n",
    "], return_tensors = \"pt\").to(\"cuda\")\n",
    "\n",
    "outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)\n",
    "tokenizer.batch_decode(outputs)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "426e8495b6a71528"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
